sp_tree <- tree %>%
ggtree(aes(color = supergroup),
linewidth = 1.0,
layout = "rectangular"
) %<+% tree_meta$tips +
geom_tiplab(align = TRUE) +
scale_color_manual(values = SUPERGROUP_COLS) +
geom_treescale() +
scale_x_continuous(expand = expansion(0.2)) +
scale_y_tree()
## Warning: `expand_scale()` was deprecated in ggplot2 3.3.0.
## ℹ Please use `expansion()` instead.
## This warning is displayed once every 8 hours.
## Call `lifecycle::last_lifecycle_warnings()` to see where this warning was
## generated.
## Scale for y is already present.
## Adding another scale for y, which will replace the existing scale.
# Adjust the second plot to align with the tree and facet grid titles properly
hm_for_tree_id <- ggtreeplot(sp_tree, og_by_id_meta, data_label = "id", expand_limits = expand_scale(0, 1.2)) +
geom_tile(aes(x = "Mean COGEQC hscore scaled vs all", fill = cogeqc_hscore_scaled_all_mean)) +
scale_fill_gradient2(low = "#2166AC", high = "#B2182B", mid = "#F7F7F7", midpoint = median(og_by_id_meta$cogeqc_hscore_scaled_all_mean, na.rm = TRUE), guide = guide_colorbar(order = 1)) +
new_scale_fill() +
geom_tile(aes(x = "Mean OG size", fill = total_parent_seqs_mean)) +
scale_fill_gradient2(low = "#FCFBFD", high = "#3F007D", guide = guide_colorbar(order = 2)) +
new_scale_fill() +
geom_tile(aes(x = "Num of OGs including species", fill = num_orthogroups)) +
scale_fill_gradient2(low = "#EFF3FF", high = "#08519C", guide = guide_colorbar(order = 3)) +
new_scale_fill() +
geom_tile(aes(x = "Num of OGs with < 10 genes that include species", fill = parent_seqs_below_10_sum)) +
scale_fill_gradient2(low = "#74C476", high = "#00441B", guide = guide_colorbar(order = 4)) +
new_scale_fill() +
geom_tile(aes(x = "Num of OGs with 10-50 genes that include species", fill = parent_seqs_from_10_to_50_sum)) +
scale_fill_gradient2(low = "#74C476", high = "#00441B", guide = guide_colorbar(order = 5)) +
new_scale_fill() +
geom_tile(aes(x = "Num of OGs with 50-100 genes that include species", fill = parent_seqs_from_50_to_100_sum)) +
scale_fill_gradient2(low = "#74C476", high = "#00441B", guide = guide_colorbar(order = 6)) +
new_scale_fill() +
geom_tile(aes(x = "Num of OGs with 100-200 genes that include species", fill = parent_seqs_from_100_to_200_sum)) +
scale_fill_gradient2(low = "#74C476", high = "#00441B", guide = guide_colorbar(order = 7)) +
new_scale_fill() +
geom_tile(aes(x = "Num of OGs with 200-500 genes that include species", fill = parent_seqs_from_200_to_500_sum)) +
scale_fill_gradient2(low = "#74C476", high = "#00441B", guide = guide_colorbar(order = 8)) +
new_scale_fill() +
geom_tile(aes(x = "Num of OGs with 500-1000 genes that include species", fill = parent_seqs_from_500_to_1000_sum)) +
scale_fill_gradient2(low = "#74C476", high = "#00441B", guide = guide_colorbar(order = 9)) +
new_scale_fill() +
geom_tile(aes(x = "Num of OGs with 1000-10000 genes that include species", fill = parent_seqs_from_1000_to_10000_sum)) +
scale_fill_gradient2(low = "#74C476", high = "#00441B", guide = guide_colorbar(order = 10)) +
new_scale_fill() +
geom_tile(aes(x = "Num of OGs with > 10000 genes that include species", fill = parent_seqs_over_10000_sum)) +
scale_fill_gradient2(low = "#74C476", high = "#00441B", guide = guide_colorbar(order = 11)) +
theme_bw() +
theme(
legend.title = element_blank(),
axis.text.x = element_text(angle = 45, hjust = 1)
) +
facet_grid(. ~ OG_source, scales = "free_y", space = "free") +
no_y_axis()
tree_hm_for_tree_id <- sp_tree + hm_for_tree_id +
plot_annotation() +
plot_layout(guides = "collect") &
theme(legend.position = "bottom", legend.direction = "vertical")
## ggplot method
# plot the geom with y axis plotted using tree_y to reorder the y axis to match tree
hm_for_tree_id_gc <- ggtreeplot(sp_tree, og_by_id_gc_meta, aes(x = most_common_pfam_hmm_name), data_label = "id", expand_limits = expand_scale(0,1.2)) +
geom_tile(aes(fill = cogeqc_hscore_scaled_all_mean)) +
scale_fill_gradient2(
low = "#2166AC",
high = "#B2182B",
mid = "#F7F7F7",
midpoint = median(og_by_id_gc_meta$cogeqc_hscore_scaled_all_mean, na.rm = TRUE)
) +
theme_bw() +
theme(axis.text.x = element_text(angle = 45, hjust = 1)) +
facet_grid(. ~ OG_source, scales = "free_y", space = "free") +
no_y_axis()
# now to align the tree tips with the y axis of geom
tree_hm_for_tree_id_gc <- sp_tree + hm_for_tree_id_gc +
plot_annotation() +
plot_layout(guides = "collect")
# plot by tools
vn_cogeqc_hssc_tools <- og_meta %>% ggviolin(
y = "cogeqc_hscore_scaled_all", x = "OG_source",
trim = TRUE,
add = "jitter",
fill = "OG_source"
) +
scale_fill_manual(values = OG_CALLER_COLS) +
labs(y = "Scaled homogeneity scores", x = "Source of orthogroups",
title = "Distribution of mean homogeneity scores for orthogroups") +
theme(plot.subtitle = ggtext::element_markdown())
# plot by species, color tools
vn_cogeqc_hssc_sp <- og_meta %>%
separate_longer_delim(id, delim = ",") %>%
separate_longer_delim(supergroup, delim = ",") %>%
unique() %>%
mutate(id = factor(id, levels = unique(main_seq_long %>% arrange(supergroup) %>% pull(id)))) %>%
ggplot(aes(x = id, y = cogeqc_hscore_scaled_all, fill = OG_source)) +
geom_violin(trim = TRUE, position = position_dodge(1)) +
scale_fill_manual(values = OG_CALLER_COLS) +
labs(y = "Scaled homogeneity scores", x = "Species",
title = "Distribution of mean homogeneity scores for orthogroups") +
theme(plot.subtitle = ggtext::element_markdown()) +
theme_bw()
# hack to label axis with supergroup
vn_cogeqc_hssc_sp <- vn_cogeqc_hssc_sp + axis_lab_id_sg +
plot_layout(ncol = 1, nrow = 2, heights = c(20, 0.5)) +
plot_layout(guides = "collect")
sp_ogs_coghs_vs_size_vs_sgs <- og_meta %>%
ggplot(aes(x = og_size,
y = cogeqc_hscore_scaled_all,
size = total_supergroups,
color = OG_source)) +
geom_point(alpha=0.7) +
scale_size(range = c(0, 10)) +
scale_color_manual(values = OG_CALLER_COLS) +
theme(legend.position = "none") +
theme_bw()
vn_size_tools <- og_meta %>%
ggviolin(y = "og_size", x = "OG_source",
trim = TRUE,
add = "jitter",
fill = "OG_source"
) +
scale_fill_manual(values = OG_CALLER_COLS) +
labs(y = "Num genes in OG", x = "Source of orthogroups") +
theme(plot.subtitle = ggtext::element_markdown()) +
scale_y_continuous(trans = 'log10', labels = scales::comma)
# Create the heatmap using ggplot2
jaccard_metrics_heat_callers <- jaccard_metrics_all_long %>%
ggplot(aes(x = tool1, y = tool2, fill = Value)) +
geom_tile() +
scale_fill_gradient2(low = "#2166AC", high = "#B2182B", mid = "#F7F7F7", midpoint = 0.5) +
facet_wrap(~Metric) +
theme_minimal() +
labs(
title = "Heatmaps of Metrics by Tool Pair",
x = "Tool 1", y = "Tool 2", fill = "Value"
) +
theme(axis.text.x = element_text(angle = 45, hjust = 1)) +
theme(strip.text = element_text(size = 12)) +
coord_fixed(ratio = 1)
# OG vs OG heatmap prep
hm_ogs_vs <- og_meta %>%
filter(OG_source == "orthofinder") %>%
mutate(Orthogroup = factor(Orthogroup, levels = unique(Orthogroup))) %>%
ggplot() +
geom_tile(aes(x = "COGEQC hscore scaled vs all", y = Orthogroup, fill = cogeqc_hscore_scaled_all)) +
scale_fill_gradient2(low = "#FFF5F0", high = "#67000D", limits = c(0, 1), midpoint = 0.25, guide = guide_colorbar(order = 5)) +
new_scale_fill() +
geom_tile(aes(x = "% of OG that is the most common Pfam", y = Orthogroup, fill = percent_most_common_pfam_hmm_name)) +
scale_fill_gradient2(low = "#FCFBFD", high = "#3F007D", limits = c(0, 100), midpoint = 25, guide = guide_colorbar(order = 1)) +
new_scale_fill() +
geom_tile(aes(x = "OG size", y = Orthogroup, fill = og_size)) +
scale_fill_gradient2(low = "#B2182B", high = "#2166AC", mid = "#F7F7F7", guide = guide_colorbar(order = 6)) +
new_scale_fill() +
geom_tile(aes(x = "Best JI vs ", y = Orthogroup, fill = Best_Jaccard)) +
scale_fill_gradient2(low = "#762A83", high = "#1B7837", mid = "#F7F7F7", limits = c(0, 1), midpoint = 0.5, guide = guide_colorbar(order = 4)) +
new_scale_fill() +
geom_tile(aes(x = "OG source", y = Orthogroup, fill = OG_source)) +
scale_fill_manual(values = OG_CALLER_COLS, guide = guide_legend(order = 7)) +
new_scale_fill() +
geom_tile(aes(x = "% species in OG", y = Orthogroup, fill = percent_total_ids)) +
scale_fill_gradient2(low = "#F7FCF5", high = "#00441B", limits = c(0, 100), midpoint = 25, guide = guide_colorbar(order = 2)) +
new_scale_fill() +
geom_tile(aes(x = "% supergroups in OG", y = Orthogroup, fill = percent_total_supergroups)) +
scale_fill_gradient2(low = "#EFF3FF", high = "#08519C", limits = c(0, 100), midpoint = 25, guide = guide_colorbar(order = 3)) +
new_scale_fill() +
geom_tile(aes(x = "vs OG source", y = Orthogroup, fill = vs_OG_source)) +
scale_fill_manual(values = OG_CALLER_COLS, guide = guide_legend(order = 8)) +
theme_bw() +
theme(legend.position = "bottom",
legend.direction = "vertical",
legend.title = element_blank(),
axis.text.x = element_text(angle = 45, hjust = 1)) +
facet_grid(most_common_pfam_hmm_name ~ ., scales = "free", space = "free")
up_in_ogs <- list()
# complex upset - tries to use column named id as data by default
# make matrices - for supergroups
up_in_ogs$sg <- create_presence_matrix(main_seq_long, supergroup, Orthogroup)
up_in_ogs$sg_cols <- colnames(up_in_ogs$sg)[-1]
up_in_ogs$sg <- up_in_ogs$sg %>% left_join(og_meta %>% rename(species = id), by = "Orthogroup")
# Prepare the colours for the sets, some R packages are so annoying!!!
queries_list <- purrr::map(names(SUPERGROUP_COLS), ~{
upset_query(set = .x, color = SUPERGROUP_COLS[.x], fill = SUPERGROUP_COLS[.x])
}) %>%
purrr::keep(~ .x$set %in% up_in_ogs$sg_cols)
up_ogs_by_sg <- upset(
up_in_ogs$sg,
up_in_ogs$sg_cols,
annotations = list(
'OG size' = (
ggplot(mapping = aes(y = og_size))
+ geom_jitter(aes(color = OG_source), na.rm = TRUE)
+ geom_violin(alpha = 0, na.rm = TRUE)
+ scale_y_continuous(trans = 'log10', labels = scales::comma)
+ scale_color_manual(values = OG_CALLER_COLS)
),
'OG scaled Homogeneity Score' = (
# note that aes(x=intersection) is supplied by default
ggplot(mapping = aes(y = cogeqc_hscore_scaled_all))
+ geom_jitter(aes(color = OG_source), na.rm = TRUE)
+ geom_violin(alpha = 0, na.rm = TRUE)
+ scale_color_manual(values = OG_CALLER_COLS)
),
'OG source' = (
ggplot(mapping = aes(fill = up_in_ogs$sg$OG_source))
+ geom_bar(stat = 'count', position = 'fill')
+ scale_fill_manual(values = OG_CALLER_COLS)
+ ylab('OG source')
)
),
min_size = 10,
width_ratio = 0.1,
sort_intersections_by = 'degree',
queries = queries_list
)
# make matrices - for geneclasses
up_in_ogs$gc <- create_presence_matrix(main_seq_long, pfam_hmm_name, Orthogroup)
up_in_ogs$gc_cols <- colnames(up_in_ogs$gc)[-1]
up_in_ogs$gc <- up_in_ogs$gc %>% left_join(og_meta %>% rename(species = id), by = "Orthogroup")
up_ogs_by_gc <- upset(
up_in_ogs$gc,
up_in_ogs$gc_cols,
annotations = list(
'OG size' = (
ggplot(mapping = aes(y = og_size))
+ geom_jitter(aes(color = OG_source), na.rm = TRUE)
+ geom_violin(alpha = 0, na.rm = TRUE)
+ scale_y_continuous(trans = 'log10', labels = scales::comma)
+ scale_color_manual(values = OG_CALLER_COLS)
),
'OG scaled Homogeneity Score' = (
# note that aes(x=intersection) is supplied by default
ggplot(mapping = aes(y = cogeqc_hscore_scaled_all))
+ geom_jitter(aes(color = OG_source), na.rm = TRUE)
+ geom_violin(alpha = 0, na.rm = TRUE)
+ scale_color_manual(values = OG_CALLER_COLS)
),
'OG source' = (
ggplot(mapping = aes(fill = up_in_ogs$gc$OG_source))
+ geom_bar(stat = 'count', position = 'fill')
+ scale_fill_manual(values = OG_CALLER_COLS)
+ ylab('OG source')
)
),
min_size = 5, # this is really important as it controls how many are shown
width_ratio = 0.1,
sort_intersections_by = 'degree'
)
Overall Comparison of
OG callers
Orthogroup callers
overall OG composition Similarity
jaccard_metrics_heat_callers

OG Homogeneity by OG
callers
## Warning: Removed 489 rows containing non-finite outside the scale range
## (`stat_ydensity()`).
## Warning: Removed 489 rows containing missing values or values outside the scale range
## (`geom_point()`).

OG Homogeneity by
Species
## Warning: Removed 809 rows containing non-finite outside the scale range
## (`stat_ydensity()`).

Ortholog
Properties
Orthogroup
statistics

Orthogroup
homogeneity score vs OG size
sp_ogs_coghs_vs_size_vs_sgs
## Warning: Removed 489 rows containing missing values or values outside the scale range
## (`geom_point()`).

Ortholog
Distribution
Species tree

Species tree by
geneclass

OG distribution
across supergroups
## Warning: Removed 81 rows containing non-finite outside the scale range
## (`stat_count()`).

OG distribution
across gene classes
## Warning: Groups with fewer than two datapoints have been dropped.
## ℹ Set `drop = FALSE` to consider such groups for position adjustment purposes.
## Warning: Removed 220 rows containing non-finite outside the scale range
## (`stat_count()`).
